/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.GeneralizedString;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

/**
 * Subcommand "annotate": searches a single DNA sequence read from a FASTA
 * file for all matches of an IUPAC motif and prints one EMBL feature-table
 * entry per match through {@link Log}.
 */
public class AnnotateSubcommand extends Subcommand {

	/**
	 * @return the usage text of the superclass followed by the argument
	 *         synopsis and the option list of this subcommand
	 */
	@Override
	public String usage() {
		StringBuilder text = new StringBuilder();
		text.append(super.usage());
		text.append(" [options] <motif> <fasta-file>\n");
		text.append("\n");
		text.append("Options:\n");
		text.append("  -r: output matches of reverse complementary motif");
		return text.toString();
	}


	/**
	 * @return a one-sentence description of this subcommand
	 */
	@Override
	public String description() {
		return "Creates an annotation track in EMBL format that contains all matches of a given IUPAC pattern.";
	}


	/**
	 * @return the name of this subcommand, {@code "annotate"}
	 */
	@Override
	public String name() {
		return "annotate";
	}


	/**
	 * Parses the command line, loads the FASTA file and prints the
	 * annotations for the motif and, if option {@code -r} is set, also for
	 * its reverse complement.
	 *
	 * @param args command line arguments; passed unchanged to
	 *             {@code parseOptions}
	 * @return {@code 1} if reading the FASTA file throws or if the file does
	 *         not contain exactly one sequence, {@code 0} otherwise
	 */
	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "r");

		// Positional arguments: the motif first, then the FASTA file name.
		final String motif = getStringArgument(0);
		final String fastaPath = getStringArgument(1);

		// Option -r additionally reports matches of the reverse complement.
		final boolean withReverseStrand = getBooleanOption("r", false);

		final Alphabet alphabet = Alphabet.getDnaAlphabet();

		final List<NamedSequence> records;
		try {
			records = SequenceUtils.readFastaFile(fastaPath, alphabet);
		} catch (Exception e) {
			// Any failure while reading is reported and mapped to exit code 1.
			Log.errorln(e.toString());
			return 1;
		}

		if (records.size() != 1) {
			Log.errorln("Error: input file must contain exactly one sequence.");
			return 1;
		}
		final int[] sequence = records.get(0).getSequence();

		// The forward motif is always reported; the reverse complement only on request.
		outputEmblAnnotations(sequence, motif, false);
		if (withReverseStrand) {
			outputEmblAnnotations(sequence, motif, true);
		}
		return 0;
	}

	/**
	 * Builds an automaton for the motif (or for its reverse complement),
	 * runs it over the sequence and prints one two-line EMBL feature entry
	 * per reported match position.
	 *
	 * NOTE(review): the reported position is printed unchanged as the end
	 * coordinate; whether {@code CDFA.MatchPosition.getPosition()} is zero-
	 * or one-based cannot be seen from this file -- confirm against the
	 * EMBL convention.
	 *
	 * @param sequence the sequence to be searched
	 * @param pattern  the IUPAC motif; always printed in its original form in
	 *                 the note, even when the reverse complement is searched
	 * @param reverse  if {@code true}, search for the reverse complementary
	 *                 motif and wrap the location in {@code complement(...)}
	 */
	private static void outputEmblAnnotations(int[] sequence, String pattern, boolean reverse) {
		final Alphabet alphabet = Alphabet.getDnaAlphabet();

		final List<GeneralizedString> motifs = new ArrayList<GeneralizedString>(1);
		final String searched = reverse ? Iupac.reverseComplementary(pattern) : pattern;
		motifs.add(Iupac.toGeneralizedString(searched));

		// The meaning of the third argument is defined by DFAFactory.build,
		// which is not visible from this file.
		final CDFA automaton = DFAFactory.build(alphabet, motifs, 10000);
		final List<CDFA.MatchPosition> hits = automaton.findMatchPositions(sequence);

		// Both output variants take the same arguments, so choose the format once.
		final String entryFormat = reverse
				? "FT CDS complement(%d..%d)\nFT     /note=\"Motif %s\"%n"
				: "FT CDS %d..%d\nFT     /note=\"Motif %s\"%n";
		final int motifLength = pattern.length();

		for (CDFA.MatchPosition hit : hits) {
			final int end = hit.getPosition();
			// The start is derived from the reported position and the motif length.
			final int start = end - motifLength + 1;
			Log.printf(Log.Level.STANDARD, entryFormat, start, end, pattern);
		}
	}
}
